Import data and set objective

# Install pacman (the package manager used throughout this document) on first
# run. requireNamespace() only tests availability — unlike require(), it does
# not attach the package and does not print a load message.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load("knitr")

# knitr chunks resolve relative paths from root.dir; the read.csv() paths below
# are relative to that directory.
knitr::opts_knit$set(root.dir = "~/source/big_data/Group6/datasets")
df.train <- read.csv("../datasets/NS.TRAIN.csv")
df.test <- read.csv("../datasets/NS.TEST.csv")

Importing the DF

# Training set size: 88,416 rows x 35 columns (per the knitted output below).
dim(df.train)
## [1] 88416    35
# Cell-level NA scan: all 3,094,560 cells (88416 * 35) are non-missing.
table(is.na(df.train))
## 
##   FALSE 
## 3094560
Dimensions of the DataFrame and count of missing values
summary(df.train)
##    patient_id         appointment         week_day        schedule_date  
##  Min.   :3.920e+04   Min.   :5030230   Min.   :1.000   2016-05-03: 3389  
##  1st Qu.:4.156e+12   1st Qu.:5640302   1st Qu.:2.000   2016-05-02: 3364  
##  Median :3.168e+13   Median :5680628   Median :3.000   2016-05-16: 3311  
##  Mean   :1.470e+14   Mean   :5675348   Mean   :2.857   2016-05-05: 3285  
##  3rd Qu.:9.433e+13   3rd Qu.:5725620   3rd Qu.:4.000   2016-05-10: 3179  
##  Max.   :9.999e+14   Max.   :5790484   Max.   :6.000   2016-05-09: 3152  
##                                                        (Other)   :68736  
##    appointmnet_date  waiting_time         age           is_female     
##  2016-06-06: 3736   Min.   :  0.00   Min.   :  0.00   Min.   :0.0000  
##  2016-05-16: 3693   1st Qu.:  0.00   1st Qu.: 18.00   1st Qu.:0.0000  
##  2016-05-30: 3636   Median :  4.00   Median : 37.00   Median :1.0000  
##  2016-06-07: 3589   Mean   : 10.18   Mean   : 37.13   Mean   :0.6502  
##  2016-06-08: 3586   3rd Qu.: 15.00   3rd Qu.: 55.00   3rd Qu.:1.0000  
##  2016-05-11: 3580   Max.   :179.00   Max.   :115.00   Max.   :1.0000  
##  (Other)   :66596                                                     
##   scholarship              neighbourhood       region         poverty     
##  Min.   :0.00000   Jardim Camburi : 6183   Min.   :1.000   Min.   :0.000  
##  1st Qu.:0.00000   Maria Ortiz    : 4635   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.00000   Resistencia    : 3517   Median :4.000   Median :4.000  
##  Mean   :0.09842   Jardim da Penha: 3105   Mean   :4.427   Mean   :3.184  
##  3rd Qu.:0.00000   Itarare        : 2797   3rd Qu.:6.000   3rd Qu.:4.000  
##  Max.   :1.00000   Centro         : 2695   Max.   :9.000   Max.   :5.000  
##                    (Other)        :65484                                  
##      x_coor           y_coor        hipertension       diabetes      
##  Min.   :-20.51   Min.   :-40.84   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:-20.31   1st Qu.:-40.33   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :-20.30   Median :-40.31   Median :0.0000   Median :0.00000  
##  Mean   :-20.16   Mean   :-40.30   Mean   :0.1975   Mean   :0.07185  
##  3rd Qu.:-20.27   3rd Qu.:-40.30   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :-14.86   Max.   :-29.33   Max.   :1.0000   Max.   :1.00000  
##                                                                      
##    alcoholism         handcap         sms_recieved       same_day     
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   : 1.000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.: 1.000  
##  Median :0.00000   Median :0.00000   Median :0.0000   Median : 1.000  
##  Mean   :0.03019   Mean   :0.02264   Mean   :0.3216   Mean   : 1.187  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.: 1.000  
##  Max.   :1.00000   Max.   :4.00000   Max.   :1.0000   Max.   :10.000  
##                                                                       
##   week_before       ever_before     known_week_after known_ever_after
##  Min.   : 0.0000   Min.   : 0.000   Min.   :0        Min.   :0       
##  1st Qu.: 0.0000   1st Qu.: 0.000   1st Qu.:0        1st Qu.:0       
##  Median : 0.0000   Median : 0.000   Median :0        Median :0       
##  Mean   : 0.4039   Mean   : 1.187   Mean   :0        Mean   :0       
##  3rd Qu.: 0.0000   3rd Qu.: 1.000   3rd Qu.:0        3rd Qu.:0       
##  Max.   :21.0000   Max.   :83.000   Max.   :0        Max.   :0       
##                                                                      
##  all_known_appointments    no_show          region_1      
##  Min.   :0              Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0              1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0              Median :0.0000   Median :0.00000  
##  Mean   :0              Mean   :0.2024   Mean   :0.08092  
##  3rd Qu.:0              3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :0              Max.   :1.0000   Max.   :1.00000  
##                                                           
##     region_2         region_3         region_4         region_5      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.00000  
##  Mean   :0.1423   Mean   :0.1359   Mean   :0.2288   Mean   :0.03698  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##                                                                      
##     region_6         region_7         region_8          region_9       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00e+00  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00e+00  
##  Median :0.0000   Median :0.0000   Median :0.00000   Median :0.00e+00  
##  Mean   :0.1418   Mean   :0.1633   Mean   :0.06993   Mean   :2.26e-05  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00e+00  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.00e+00  
## 

Display the summary and define a business target

Business question
  1. Is it possible to predict no show to Dr. appointments?
Predictor columns explained
  • patient_id - Patient ID
  • appointment - Appointment ID
  • week_day - Numeric week day (1-6)
  • schedule_date - Schedule date
  • appointmnet_date - Appointmnet date
  • waiting_time - Diff between Schedule and Appointmnet
  • age - Age of patient
  • is_female - Sex (boolean)
  • scholarship - Scholarship received (boolean)
  • neighbourhood - Neighbourhood
  • hipertension
  • diabetes
  • alcoholism
  • handcap
  • sms_recieved
  • poverty - Neighbourhood poverty rank
  • x_coor - Longitude
  • y_coor - Latitude
  • region - Neighbourhood numeric region
  • no_show - Showed to appointment (boolean)

EDA and IDA

Numeric and Categorical
numeric categorical
patient_id schedule_date
appointment appointmnet_date
week_day neighbourhood
waiting_time poverty
age -
is_female -
scholarship -
- -
which columns have missing values
# Count missing values per column. vapply() is used instead of sapply() because
# it guarantees a type-stable named integer vector regardless of input shape.
vapply(df.train, function(col) sum(is.na(col)), integer(1))
##             patient_id            appointment               week_day 
##                      0                      0                      0 
##          schedule_date       appointmnet_date           waiting_time 
##                      0                      0                      0 
##                    age              is_female            scholarship 
##                      0                      0                      0 
##          neighbourhood                 region                poverty 
##                      0                      0                      0 
##                 x_coor                 y_coor           hipertension 
##                      0                      0                      0 
##               diabetes             alcoholism                handcap 
##                      0                      0                      0 
##           sms_recieved               same_day            week_before 
##                      0                      0                      0 
##            ever_before       known_week_after       known_ever_after 
##                      0                      0                      0 
## all_known_appointments                no_show               region_1 
##                      0                      0                      0 
##               region_2               region_3               region_4 
##                      0                      0                      0 
##               region_5               region_6               region_7 
##                      0                      0                      0 
##               region_8               region_9 
##                      0                      0

number of missing values per column

which columns to predict

Column to predict is no_show

Regression or Classification

Since the Y column (no_show) is a binary categorical variable, the problem is a classification problem

Numeric column histograms
# Keep only complete cases before plotting (the NA scan above found none, so
# this is a defensive no-op). Use <- for assignment, per R convention.
df <- na.omit(df.train)
# 2x4 grid of histograms; log/sqrt transforms tame the right-skewed counts.
par(mfrow = c(2, 4))
hist(df$week_day, main="week_day", breaks = 10)
hist(log(df$waiting_time+1), main="log waiting_time", breaks = 10)
hist(sqrt(df$age), main="sqrt age", breaks = 10)
hist(df$poverty, main="poverty", breaks = 10)
hist(df$region, main="region", breaks = 10)
hist(log(df$same_day+1), main="log same_day", breaks = 10)
hist(log(df$week_before+1), main="log week_before", breaks = 10)
hist(log(df$ever_before+1), main="log ever_before", breaks = 10)

EDA

High probability graphs

# Mean no-show rate per distinct log(waiting_time + 1) value.
eda_waiting_time <- aggregate(df.train$no_show, by=list(log(df.train$waiting_time+1)), FUN=mean)
names(eda_waiting_time) <- c("waiting time","No show prob")
# Fix: after the rename above, eda_waiting_time$x no longer exists, so the old
# call passed NULL as a positional argument; plot the 2-column frame directly.
plot(eda_waiting_time, type="p", main="No show prob vs. log waiting time")

We can identify a group of confident shows around 100 waiting time (days), or 4.5 in log, whereas the majority is around probability of 0.3

# Mean no-show rate for each numeric week day (1-6).
eda_week_day<-aggregate(df.train$no_show, by=list(df.train$week_day), FUN=mean)
# NOTE(review): the second positional argument duplicates the y column already
# inside the data frame — presumably intended as y; confirm it has any effect.
plot(eda_week_day, eda_week_day$x, type="l", main="No show prob vs. week_day")

# NOTE(review): unresolved git merge conflict removed — kept the sqrt(age)
# variant from HEAD (the raw-age variant differed only in the transform).
eda_age<-aggregate(df.train$no_show, by=list(sqrt(df.train$age)), FUN=mean)
plot(eda_age, eda_age$x, type="p",  main="No show prob vs. sqrt age")

# Mean no-show rate with vs. without a scholarship (type="h" with a very wide
# lwd draws bar-like columns; y-axis clipped to [0.1, 0.4] to show the gap).
eda_scholarship<-aggregate(df.train$no_show, by=list((df.train$scholarship)), FUN=mean)
plot(eda_scholarship, eda_scholarship$x, type="h", col = c("light blue", "blue"), lwd = 100, ylim = c(0.1,0.4), main="No show prob vs. scholarship")

# Mean no-show rate with vs. without an SMS reminder, same bar style.
eda_sms_recieved<-aggregate(df.train$no_show, by=list((df.train$sms_recieved)), FUN=mean)
plot(eda_sms_recieved, eda_sms_recieved$x, type="h", col = c("light blue", "blue"), lwd = 100, ylim = c(0.1,0.4), main="No show prob vs. sms_recieved")

#### Low probability graphs

# 2x4 grid: mean no-show rate against each remaining predictor, drawn on a
# fixed [0, 1] y-scale so weakly-discriminating variables appear as flat lines.
par(mfrow=c(2,4))
eda_is_female<-aggregate(df.train$no_show, by=list(df.train$is_female), FUN=mean)
plot(eda_is_female, eda_is_female$x, type="l", main="No show prob vs. is_female", ylim=c(0,1))

eda_poverty<-aggregate(df.train$no_show, by=list((df.train$poverty)), FUN=mean)
plot(eda_poverty, eda_poverty$x, type="l", main="No show prob vs. poverty", ylim=c(0,1))

eda_diabetes<-aggregate(df.train$no_show, by=list((df.train$diabetes)), FUN=mean)
plot(eda_diabetes, eda_diabetes$x, type="l", main="No show prob vs. diabetes", ylim=c(0,1))

eda_hipertension<-aggregate(df.train$no_show, by=list((df.train$hipertension)), FUN=mean)
plot(eda_hipertension, eda_hipertension$x, type="l", main="No show prob vs. hipertension", ylim=c(0,1))

eda_handcap<-aggregate(df.train$no_show, by=list((df.train$handcap)), FUN=mean)
plot(eda_handcap, eda_handcap$x, type="l", main="No show prob vs. handcap", ylim=c(0,1))

eda_same_day<-aggregate(df.train$no_show, by=list((df.train$same_day)), FUN=mean)
plot(eda_same_day, eda_same_day$x, type="l", main="No show prob vs. same_day", ylim=c(0,1))

eda_week_before<-aggregate(df.train$no_show, by=list((df.train$week_before)), FUN=mean)
plot(eda_week_before, eda_week_before$x, type="l", main="No show prob vs. week_before", ylim=c(0,1))

eda_ever_before<-aggregate(df.train$no_show, by=list((df.train$ever_before)), FUN=mean)
plot(eda_ever_before, eda_ever_before$x, type="l", main="No show prob vs. ever_before", ylim=c(0,1))

Modeling

Logistic Model

# NOTE(review): an unresolved nested git merge conflict was removed here. The
# branch kept is the raw-feature model (age, waiting_time): its knitted output
# below (AIC 85536) is the one consistent with the evaluation numbers later in
# the document. The transformed-feature variant (log waiting time, sqrt age)
# reported a lower AIC (81711) and is worth revisiting — re-knit after any change.
logit_model <- glm(no_show ~ age+
                         waiting_time+
                         scholarship+
                         sms_recieved+
                         region, data = df.train, family = binomial)
summary (logit_model)
## 
## Call:
## glm(formula = no_show ~ age + waiting_time + scholarship + sms_recieved + 
##     region, family = binomial, data = df.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3459  -0.6745  -0.5807  -0.4953   2.1922  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.3894915  0.0248995 -55.804  < 2e-16 ***
## age          -0.0075171  0.0003781 -19.882  < 2e-16 ***
## waiting_time  0.0228074  0.0005443  41.903  < 2e-16 ***
## scholarship   0.2121474  0.0274403   7.731 1.07e-14 ***
## sms_recieved  0.3624426  0.0187984  19.280  < 2e-16 ***
## region       -0.0306387  0.0040925  -7.487 7.07e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 89074  on 88415  degrees of freedom
## Residual deviance: 85524  on 88410  degrees of freedom
## AIC: 85536
## 
## Number of Fisher Scoring iterations: 4
plot(logit_model)


CART

pacman::p_load("tree")
# NOTE(review): merge conflict resolved — kept the raw features (waiting_time,
# age) to stay consistent with the resolved logistic-regression section; the
# captured summary below matches this formula.
noshow.CART <- tree(no_show ~ week_day+
                      waiting_time+
                      age+
                      is_female+
                      scholarship+
                      hipertension+
                      diabetes+
                      alcoholism+
                      handcap+
                      sms_recieved+
                      poverty+
                      region ,data = df.train)
plot(noshow.CART)
text(noshow.CART, pretty = 0, cex=0.5)

summary(noshow.CART)
## 
## Regression tree:
## tree(formula = no_show ~ week_day + waiting_time + age + is_female + 
##     scholarship + hipertension + diabetes + alcoholism + handcap + 
##     sms_recieved + poverty + region, data = df.train)
## Variables actually used in tree construction:
## [1] "waiting_time" "age"         
## Number of terminal nodes:  3 
## Residual mean deviance:  0.1466 = 12960 / 88410 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.33060 -0.33060 -0.04566  0.00000 -0.04566  0.95430

RF

pacman::p_load("randomForest")
set.seed(7)
# NOTE(review): merge conflict resolved — kept the raw-feature fit with
# ntree = 100 (faster, matches the resolved sections above). The warning below
# fires because no_show is numeric 0/1, so randomForest runs in regression
# mode; the type="classification" argument does not change that — convert the
# response to a factor if a true classification forest is wanted. Confirm.
noshow.RF <- randomForest(no_show ~ week_day
                          +waiting_time
                          +age
                          +is_female
                          +scholarship
                          +sms_recieved
                          +poverty
                          +region
                          , data = df.train, na.action=na.omit, type="classification", ntree=100) 
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
plot(noshow.RF)

#importance(noshow.RF)
varImpPlot(noshow.RF)


GBM

# install.packages("gbm",repos = "http://cran.us.r-project.org")
#library("gbm")
pacman::p_load("gbm")
set.seed(7) #same seed to repeat the RF
# NOTE(review): merge conflict resolved — kept the raw features (waiting_time,
# age), matching the other resolved model sections and the captured call and
# relative-influence output below.
no_show.gbm <- gbm (no_show ~ week_day+
                         waiting_time+
                         age+
                         is_female+
                         scholarship+
                         sms_recieved+
                         poverty+
                         region,data = df.train, n.trees = 1000, interaction.depth = 4, shrinkage = 0.2, verbose = F)
## Distribution not specified, assuming bernoulli ...
no_show.gbm
## gbm(formula = no_show ~ week_day + waiting_time + age + is_female + 
##     scholarship + sms_recieved + poverty + region, data = df.train, 
##     n.trees = 1000, interaction.depth = 4, shrinkage = 0.2, verbose = F)
## A gradient boosted model with bernoulli loss function.
## 1000 iterations were performed.
## There were 8 predictors of which 8 had non-zero influence.
summary(no_show.gbm)

##                       var   rel.inf
## waiting_time waiting_time 56.837348
## age                   age 22.970307
## region             region  5.406216
## poverty           poverty  4.646453
## week_day         week_day  4.380362
## sms_recieved sms_recieved  2.836217
## is_female       is_female  1.653932
## scholarship   scholarship  1.269164

Model evaluation

Logistic Model

# NOTE(review): merge conflict resolved — kept the 0.6 threshold on the
# raw-feature logistic model; 0.7996381 is the knitted accuracy consistent
# with the confusion matrix and F1 breakdown that follow.
threshold <- 0.6
fitted.lm.results <- predict(logit_model,df.test,type='response')
lm.prediction <- ifelse(fitted.lm.results > threshold,1,0)
lm.accuracy <- mean(lm.prediction == df.test$no_show)
lm.accuracy
## [1] 0.7996381
# caret supplies confusionMatrix(); e1071 is its required backend for the
# associated statistics. pacman::p_load installs-if-missing and attaches both.
#install.packages("caret",repos = "http://cran.us.r-project.org") 
#library("caret")
#install.packages("e1071",repos = "http://cran.us.r-project.org")
#library("e1071")
pacman::p_load("caret")
pacman::p_load("e1071")
# NOTE(review): merge conflict resolved — the output kept is the internally
# consistent one for the raw-feature model at threshold 0.6 (counts
# 17660/4402/27/16 reproduce sensitivity 0.998473 and NPV 0.372093).
confusionMatrix(data = lm.prediction, reference =  df.test$no_show)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 17660  4402
##          1    27    16
##                                           
##                Accuracy : 0.7996          
##                  95% CI : (0.7943, 0.8049)
##     No Information Rate : 0.8001          
##     P-Value [Acc > NIR] : 0.5773          
##                                           
##                   Kappa : 0.0033          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.998473        
##             Specificity : 0.003622        
##          Pos Pred Value : 0.800471        
##          Neg Pred Value : 0.372093        
##              Prevalence : 0.800136        
##          Detection Rate : 0.798914        
##    Detection Prevalence : 0.998055        
##       Balanced Accuracy : 0.501048        
##                                           
##        'Positive' Class : 0               
## 
cross.table <- table(lm.prediction, df.test$no_show)

# Guard: when the model predicts no positives at all, table() returns a single
# row; pad with zeros so the [2, ] indexing below stays valid.
l <- nrow(cross.table)
if(l< 2)  {
          cross.table <- rbind(cross.table, c(0,0))
          }

# Rows are predictions, columns are truth: [2,2] = TP, [2,1] = FP, [1,2] = FN.
accuracy <- (cross.table[1,1]+cross.table[2,2])/ (cross.table[1,1]+cross.table[2,2]+cross.table[1,2]+cross.table[2,1])
precision <- cross.table[2,2]/(cross.table[2,2]+cross.table[2,1])
recall <- cross.table[2,2]/(cross.table[2,2]+cross.table[1,2])
f1 <- 2*(recall*precision)/(recall+precision)

paste("Accuracy -",accuracy)
## [1] "Accuracy - 0.799638090929654"
paste("Precision -",precision)
## [1] "Precision - 0.372093023255814"
paste("Recall -",recall)
## [1] "Recall - 0.00362154821186057"
paste("F1 -",f1)
## [1] "F1 - 0.00717327953373683"
#install.packages("pROC",repos = "http://cran.us.r-project.org")
pacman::p_load("pROC")
plot(roc(df.test$no_show, fitted.lm.results, direction="<"), col="blue", main="Left ROC curve")

CART

# NOTE(review): merge conflict resolved — kept the HEAD threshold (0.3); the
# alternate branch used 0.2.
threshold <- 0.3
# CART emits per-row mean no-show rates (a 3-leaf regression tree, so only
# three distinct values appear in the summary).
fitted.cart.results <- predict(noshow.CART,df.test)
summary(fitted.cart.results)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.04566 0.04566 0.22663 0.20236 0.33062 0.33062
cart.prediction <- ifelse(fitted.cart.results > threshold,1,0)
summary(cart.prediction)
# NOTE(review): merge conflict resolved — kept the HEAD output (mean 0.3766,
# i.e. the 0.3 threshold).
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3766  1.0000  1.0000
cross.table <- table(cart.prediction, df.test$no_show)

# Pad to two rows when only one predicted class is present (same guard as the
# logistic-model evaluation).
l <- nrow(cross.table)
if(l< 2)  {
          cross.table <- rbind(cross.table, c(0,0))
          }

accuracy <- (cross.table[1,1]+cross.table[2,2])/ (cross.table[1,1]+cross.table[2,2]+cross.table[1,2]+cross.table[2,1])
precision <- cross.table[2,2]/(cross.table[2,2]+cross.table[2,1])
recall <- cross.table[2,2]/(cross.table[2,2]+cross.table[1,2])
f1 <- 2*(recall*precision)/(recall+precision)

# NOTE(review): merge conflict resolved — kept the F1-augmented HEAD metrics
# for the 0.3 threshold; the dropped branch reported the 0.2-threshold run.
paste("Accuracy -",accuracy)
## [1] "Accuracy - 0.663605519113323"
paste("Precision -",precision)
## [1] "Precision - 0.318716962998558"
paste("Recall -",recall)
## [1] "Recall - 0.600497962879131"
paste("F1 -",f1)
## [1] "F1 - 0.416418144718255"

RF

# NOTE(review): this section contained two overlapping merge conflicts with
# duplicated, mutually contradictory evaluation runs; a single coherent
# F1-augmented sequence (threshold 0.5) was kept. Re-knit to confirm the
# numbers against the resolved RF model above.
threshold <- 0.5
fitted.rf.results <- predict(noshow.RF,df.test)
summary(fitted.rf.results)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02125 0.05853 0.21249 0.20366 0.30583 0.69628
rf.prediction <- ifelse(fitted.rf.results > threshold,1,0)
summary(rf.prediction)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.00846 0.00000 1.00000
cross.table <- table(rf.prediction, df.test$no_show)

# Pad to two rows when only one predicted class is present.
l <- nrow(cross.table)
if(l< 2)  {
          cross.table <- rbind(cross.table, c(0,0))
          }

accuracy <- (cross.table[1,1]+cross.table[2,2])/ (cross.table[1,1]+cross.table[2,2]+cross.table[1,2]+cross.table[2,1])
precision <- cross.table[2,2]/(cross.table[2,2]+cross.table[2,1])
recall <- cross.table[2,2]/(cross.table[2,2]+cross.table[1,2])
f1 <- 2*(recall*precision)/(recall+precision)

paste("Accuracy -",accuracy)
## [1] "Accuracy - 0.801266681746211"
paste("Precision -",precision)
## [1] "Precision - 0.566844919786096"
paste("Recall -",recall)
## [1] "Recall - 0.0239927569035763"
paste("F1 -",f1)
## [1] "F1 - 0.0460369163952226"

GBM

threshold <- 0.6
# NOTE(review): predict.gbm without type = "response" returns values on the
# link (log-odds) scale — see the summary range of roughly -5 to 1.7 below —
# so thresholding at 0.6 corresponds to p > ~0.65, not p > 0.6. Confirm
# whether type = "response" was intended before comparing thresholds across
# models.
fitted.gbm.results <- predict(no_show.gbm,df.test, n.trees = 1000)
summary(fitted.gbm.results)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -5.0588 -3.0654 -1.3217 -1.7214 -0.8148  1.7028
gbm.prediction <- ifelse(fitted.gbm.results > threshold,1,0)
summary(gbm.prediction)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.00294 0.00000 1.00000
cross.table <- table(gbm.prediction, df.test$no_show)

# Pad to two rows when only one predicted class is present.
l <- nrow(cross.table)
if(l< 2)  {
          cross.table <- rbind(cross.table, c(0,0))
          }
accuracy <- (cross.table[1,1]+cross.table[2,2])/ (cross.table[1,1]+cross.table[2,2]+cross.table[1,2]+cross.table[2,1])
precision <- cross.table[2,2]/(cross.table[2,2]+cross.table[2,1])
recall <- cross.table[2,2]/(cross.table[2,2]+cross.table[1,2])
f1 <- 2*(recall*precision)/(recall+precision)

# NOTE(review): merge conflict resolved — kept the F1-augmented HEAD metrics
# for the 0.6 threshold; the duplicated 0.1-threshold run was dropped.
paste("Accuracy -",accuracy)
## [1] "Accuracy - 0.800452386337933"
paste("Precision -",precision)
## [1] "Precision - 0.553846153846154"
paste("Recall -",recall)
## [1] "Recall - 0.00814848347668628"
paste("F1 -",f1)
## [1] "F1 - 0.0160606736560339"